### Ariel White and Kris-Stella Trump
### File for aggregating 311 data and merging it with tract- and precinct-level indicators for analysis.
### Note that the merged and aggregated data (the output from this code) is also available in the replication files.  
### File posted January 2017
### Created in R version 3.1.3

#Start from an empty workspace, in the right working directory.
# NOTE(review): rm(list = ls()) deletes every object in the calling session's
# global environment; run this script in a fresh R session.
rm(list = ls())
#setwd("/home/ariel/Dropbox (MIT)/final311repdata_forposting")
#setwd("C:/Users/Ariel White/Dropbox (MIT)/final311repdata_forposting") #windows computer filepath for testing.
#setwd("/nfs/home/A/awhite/shared_space/subs_hsg/final311repdata_forposting")
# (Set your own working directory)

#Load required packages
library(data.table)
library(foreign)

#Read 311 data file (download of the full dataset of 311 calls available at https://data.cityofnewyork.us/Social-Services/311-Service-Requests-from-2010-to-Present/erm2-nwe9 on Oct 23 2014)
#Strings are kept as character; the whole file is read (for a quick test run, add nrow=10000 to read.csv).
nycsample1 <- data.table(
  read.csv("311_Service_Requests_from_2010_to_Present.csv", stringsAsFactors = FALSE)
)
#Quick look at the first rows and at the dimensions of the raw data
head(nycsample1)
dim(nycsample1)

###########################################
#1. Characterize 311 data
###########################################

#Recode date and plot call volume by date
#calldate is the calendar day of the call (IDate); callday is the same day as a "mm/dd/yyyy" string.
nycsample1[, calldate := as.IDate(Created.Date, "%m/%d/%Y %I:%M:%S %p")]
nycsample1[, callday := as.character(format(calldate, '%m/%d/%Y'))]

#One row per day with the number of calls, sorted chronologically
nycsample1[, call := 1]
dates <- nycsample1[, list(numbercalls = sum(call)), by = c("callday")]
setkey(dates, "callday")
dates[, calldate := as.IDate(callday, '%m/%d/%Y')]
setkey(dates, "calldate")

plot(dates$numbercalls) #zoom in on jump
dates[, index := .I]
#First day from which the data is treated as complete. The zoom window and the
#pre-2010 count are derived from this date rather than from hard-coded row
#numbers, so they stay correct if the download contains a different set of days.
firstfullday <- as.IDate("2010-01-01")
jumpindex <- which(dates$calldate >= firstfullday)[1] #row of the first day on/after the cutoff
plot(dates$index, dates$numbercalls, main = "Daily Call Volume from full NYC 311 dataset", ylab = "Calls/day", xlab = "Date", xaxt = 'n', xlim = c(jumpindex - 13, jumpindex + 7))
#Shows sudden increase in number of calls on 1/1/2010. Probably means that dataset incomplete before this date ( http://www.nytimes.com/2010/05/16/nyregion/16three11.html?pagewanted=all&_r=0 shows that 311 opened 2003, 100m calls by 2010)

#how many calls were placed before 2010
pre2010 <- dates[calldate < firstfullday]
sum(pre2010$numbercalls) #just 70316 calls recorded in this data before 2010

#Drop all calls before 2010.
nycsample1 <- nycsample1[calldate >= "2010-01-01", ]
#now have 8076498, down from 8146814 calls

#also drop some extraneous columns
#Columns are picked by position (32 through 48 of the table as it stands here).
# NOTE(review): positional selection depends on the column layout of this
# particular download - confirm before reusing with a newer extract.
drops <- colnames(nycsample1)[32:48]
#Delete by reference with := ; the `[<-` form copies the whole table first.
nycsample1[, (drops) := NULL]
dim(nycsample1)

#Rebuild the per-day call counts from the trimmed data and plot them to a PDF
dates <- nycsample1[, list(numbercalls = sum(call)), by = "calldate"]
setkey(dates, "calldate")
dates[, index := .I]

pdf("NYC311_callvolume_2010on.pdf")
plot(
  dates$index, dates$numbercalls,
  main = "Daily Call Volume to NYC 311 Service, 2010-2014",
  ylab = "Calls/day", xlab = "Date", xaxt = 'n'
)
#x axis: one tick at the row index used for the start of each year
Axis(
  side = 1,
  at = c(1, 366, 731, 1097, 1462),
  labels = c("2010/1/1", "2011/1/1", "2012/1/1", "2013/1/1", "2014/1/1")
)
#dashed vertical lines roughly mid-year, to make seasonality visible
abline(v = c(183, 548, 913, 1278, 1643), col = gray(.50), lty = 4)
dev.off()

#what was mean number of calls per day?
mean(dates$numbercalls)

#Tabulate the complaint types present after 2010 and write the counts to disk
calltypes <- table(nycsample1$Complaint.Type)
m <- as.data.frame(calltypes)
head(m)
dim(m)
write.csv(m, "NYCcalltypes_post2010.csv")

#LaTeX table of the 25 most frequent complaint types
#(sort by the count column, largest first, and keep the first 25 rows)
sorted <- m[order(-m[[2]]), ]
top25 <- sorted[seq_len(25), ]
names(top25) <- c("Complaint", "Frequency")
#counts are turned into strings with thousands separators for display
top25$Frequency <- prettyNum(top25$Frequency, big.mark = ",", scientific = FALSE)
library(xtable)
top25tex <- xtable(top25, caption = "Top 25 Call Types, January 2010-October 2014.  Complaint types are presented verbatim (including capitalization) from the 311 dataset.")
print(top25tex, file = "top25NYCcomplaints_2010on.tex", include.rownames = FALSE)


###########################################
# 2. Aggregating calls to the tract level
###########################################
#Load packages for geographic data analysis
library(maps)
library(maptools)
library(sp)
library(rgdal)

#Read in tract shapefiles from the Census Bureau https://www.census.gov/geo/maps-data/data/cbf/cbf_tracts.html accessed 17 January 2014.
#Remember the project directory; it is used again later to locate the precinct shapefiles.
currwd <- getwd()
#Point readOGR at the shapefile folder directly instead of changing the working
#directory and changing it back: if the read fails, the session is not left
#stranded inside the subfolder.
NYCtracts <- readOGR(dsn = file.path(currwd, "NYtractshapefiles"), layer = "tl_2010_36_tract10")
summary(NYCtracts)
proj4string(NYCtracts)

###Create typologies: public and street-level calls.
#this code was used to output a table of the different types of calls they take (all 239 of them) for coding into public and street-level categories.
#calltypes <- table(nycsample1$Complaint.Type)
#m <- as.data.frame(calltypes); head(m); dim(m)
#write.csv(m, "NYCcalltypes.csv")

#load the coded table that indicates which calls were "public" (we did this manually)
codes <- read.csv("NYCcalltypes_coded.csv")
names(codes)[2] <- "Complaint.Type"
#merge, giving each call a type (calls whose complaint type is not in the coded table drop out of the merge).
nycsample1 <- merge(nycsample1, codes, by = "Complaint.Type")
#"Public" subset: calls whose complaint type was coded Public == 1
nycsample2 <- nycsample1[nycsample1$Public == 1, ]
dim(nycsample2)
dim(nycsample1)
#before dropping non-geocoded: full sample 8.08 million, public calls 7.58 million

#create the "Street-level" calls subset:
#a call is street-level when its complaint type contains any of these phrases
street_level_phrases <- c(
  "Traffic Signal", "Street Sign", "Highway Sign", "Broken Parking Meter",
  "Broken Muni Meter", "Sidewalk Condition", "Street Condition",
  "Bike Rack Condition", "Curb Condition", "Street Light Condition",
  "Bridge Condition", "Sewer", "Bus Stop Shelter Placement", "Graffiti"
)
#one regular expression with the phrases as alternatives
street_level_regex <- paste(street_level_phrases, collapse = "|")
nycsample3 <- nycsample1[grepl(street_level_regex, nycsample1$Complaint.Type), ]
dim(nycsample1)
dim(nycsample3) #street calls 1.69million
nrow(nycsample3) / nrow(nycsample1) #retained 21% of calls

###Remove non-geocoded places for the three samples
#first run basic comparison: call descriptions with geocodes to call descriptions without geocodes
#nogeocode is TRUE when either coordinate is missing
nycsample1$nogeocode <- is.na(nycsample1$Latitude) | is.na(nycsample1$Longitude)
summary(nycsample1$nogeocode) #689848 calls lack geocode, 7386650 have geocode
sum(nycsample1$nogeocode) / nrow(nycsample1) #8.5% missing geocode
#What type of calls are non-geocoded, and which are disproportionately non-geocoded?
nycsample1$complaint <- as.factor(nycsample1$Complaint.Type)
nycsample.nogeo <- nycsample1[nycsample1$nogeocode, ]
summary(nycsample.nogeo$complaint)
#most common non-coded public complaints are about streetlights, traffic signals and street conditions
#column-wise shares: distribution of complaint types among geocoded vs. non-geocoded calls
nogeo.table <- table(nycsample1$complaint, nycsample1$nogeocode)
prop.table(nogeo.table, 2)
#many small differences, but notably traffic signal and street light condition are more likely to be un-geocoded than geocoded by a factor of 10.

#Drop calls without geocodes (keep only rows where both coordinates are present).
#Full sample
nycsample1 <- subset(nycsample1, !(is.na(Latitude) | is.na(Longitude)))

#Public calls
nycsample2$nogeocode <- is.na(nycsample2$Latitude) | is.na(nycsample2$Longitude)
sum(nycsample2$nogeocode) / nrow(nycsample2) #6.8% missing geocode
nycsample2 <- subset(nycsample2, !(is.na(Latitude) | is.na(Longitude)))

#Street-level calls
nycsample3$nogeocode <- is.na(nycsample3$Latitude) | is.na(nycsample3$Longitude)
sum(nycsample3$nogeocode) / nrow(nycsample3) #22.6% missing geocode
nycsample3 <- subset(nycsample3, !(is.na(Latitude) | is.na(Longitude)))


##Assign tract ID's to calls
#Each sample is turned into spatial points (Longitude/Latitude), re-projected to
#the coordinate system of the tract shapefile, and overlaid on the tract polygons.
#over() returns, for every call, the attribute row of the tract it falls in.
# NOTE(review): the call coordinates are declared as "+proj=longlat" with no
# datum given - confirm this matches the source data.
longlat <- CRS("+proj=longlat")
tractcrs <- CRS(proj4string(NYCtracts))

#Full sample
coordinates(nycsample1) <- c("Longitude", "Latitude")
proj4string(nycsample1) <- longlat
nycsampleT1 <- spTransform(nycsample1, tractcrs)

class(nycsampleT1)
class(NYCtracts)
join1 <- over(nycsampleT1, NYCtracts)
dim(join1)
head(join1)

#Public calls
coordinates(nycsample2) <- c("Longitude", "Latitude")
proj4string(nycsample2) <- longlat
nycsampleT2 <- spTransform(nycsample2, tractcrs)

class(nycsampleT2)
class(NYCtracts)
join2 <- over(nycsampleT2, NYCtracts)
dim(join2)
head(join2)

#Street-level calls
coordinates(nycsample3) <- c("Longitude", "Latitude")
proj4string(nycsample3) <- longlat
nycsampleT3 <- spTransform(nycsample3, tractcrs)

class(nycsampleT3)
class(NYCtracts)
join3 <- over(nycsampleT3, NYCtracts)
dim(join3)
head(join3)

#Create dataframes with tract IDs assigned to calls
#(call-level columns side by side with the attributes of the matching tract)
NYC311full1 <- data.table(cbind(as.data.frame(nycsample1), join1))
NYC311full2 <- data.table(cbind(as.data.frame(nycsample2), join2))
NYC311full3 <- data.table(cbind(as.data.frame(nycsample3), join3))

###########################################
# 3. Create tract-level counts of call types, by time periods
###########################################

#Format dates and create time dummies.
#The helper adds the columns to the table it is given by reference (data.table :=),
#so calling it is enough; nothing needs to be re-assigned.
#Every dummy is 1 for calls strictly between the two dates and NA otherwise.
# NOTE(review): both bounds are exclusive, so e.g. allyear_2010 does not include
# calls dated 2010-01-01 - confirm this is intended.
add_tract_time_dummies <- function(calls) {
  #Format dates
  calls[, calldate := as.IDate(Created.Date, "%m/%d/%Y %I:%M:%S %p")]
  calls[, callday := as.character(format(calldate, '%m/%d/%Y'))]

  #Around Census
  calls[calldate > "2010-03-14" & calldate < "2010-04-15", census_1montharound := 1]
  #Before Census
  calls[calldate > "2010-01-01" & calldate < "2010-04-01", census_3monthsbefore := 1]
  ##full year 2010
  calls[calldate > "2010-01-01" & calldate < "2011-01-01", allyear_2010 := 1]
  #around election
  calls[calldate > "2010-10-18" & calldate < "2010-11-16", election2010_1montharound := 1]
  #6 months before election
  calls[calldate > "2010-05-01" & calldate < "2010-11-02", election2010_6monthsbefore := 1]
  #3 months before election
  calls[calldate > "2010-08-01" & calldate < "2010-11-02", election2010_3monthsbefore := 1]
  ##full year 2012
  calls[calldate > "2012-01-01" & calldate < "2013-01-01", allyear_2012 := 1]
  ##full election cycle 2010, roughly
  calls[calldate > "2009-01-01" & calldate < "2011-01-01", electioncycle_2010 := 1]
  ##full election cycle 2012, roughly
  calls[calldate > "2011-01-01" & calldate < "2013-01-01", electioncycle_2012 := 1]
  #around 2012 election
  calls[calldate > "2012-10-23" & calldate < "2012-11-20", election2012_1montharound := 1]
  #6 months before 2012 election
  calls[calldate > "2012-05-05" & calldate < "2012-11-06", election2012_6monthsbefore := 1]
  #3 months before 2012 election
  calls[calldate > "2012-08-05" & calldate < "2012-11-06", election2012_3monthsbefore := 1]

  invisible(calls)
}

add_tract_time_dummies(NYC311full1) #full sample
add_tract_time_dummies(NYC311full2) #public calls
add_tract_time_dummies(NYC311full3) #street-level calls


#now collapse to tract-level counts of calls in each category.
#Each dummy is 1/NA, so summing with na.rm=TRUE counts the calls in that window.
#One output row per GEOID10 value.
tractwindowcols <- c(
  "census_1montharound", "census_3monthsbefore", "allyear_2010",
  "election2010_1montharound", "election2010_3monthsbefore", "election2010_6monthsbefore",
  "allyear_2012",
  "election2012_1montharound", "election2012_3monthsbefore", "election2012_6monthsbefore"
)

#Full sample (election-cycle columns come out as 2010, then 2012)
tract311calls <- NYC311full1[
  , lapply(.SD, sum, na.rm = TRUE),
  by = list(GEOID10),
  .SDcols = c(tractwindowcols, "electioncycle_2010", "electioncycle_2012")
]
#at this point, can save interim dataset
#save(tract311calls, file="tract311calls.Rdata")

#Public calls (election-cycle columns come out as 2012, then 2010)
tract311calls_public <- NYC311full2[
  , lapply(.SD, sum, na.rm = TRUE),
  by = list(GEOID10),
  .SDcols = c(tractwindowcols, "electioncycle_2012", "electioncycle_2010")
]
#at this point, can save interim dataset
#save(tract311calls_public, file="tract311calls_publiconly.Rdata")

#Street-level calls (same column order as the public table)
tract311calls_street <- NYC311full3[
  , lapply(.SD, sum, na.rm = TRUE),
  by = list(GEOID10),
  .SDcols = c(tractwindowcols, "electioncycle_2012", "electioncycle_2010")
]
#at this point, can save interim dataset
#save(tract311calls_street, file="tract311calls_streetonly.Rdata")

###########################################
# 4. Merge tract-level call data with census return rates
###########################################

#GEOID10 is 2010 census tract fips code; keep it as a character key called TRACTID.
head(tract311calls)
tract311calls[, TRACTID := as.character(GEOID10)]

#Load tract level census return rate data from the 2012 Census Planning Dataset, available at http://www.census.gov/research/2012_planning_database/, accessed 24 October 2014.
#(the .Rdata file provides the table NYC used below)
load("NYCtract2014censusplanningdb.Rdata")
NYC[, TRACTID := as.character(GIDTR)]
#Merge: keeps only tracts present in both tables
censusand311 <- merge(NYC, tract311calls, by = "TRACTID")
dim(NYC)
dim(tract311calls)
dim(censusand311) #quite good.

#Why does 311 data appear to have more tracts than the NYC census file suggests?
#List the tract IDs found in the call data but not in the census file, by county code.
calltracts <- tract311calls$TRACTID
censustracts <- NYC$TRACTID
unmatched <- calltracts[!calltracts %in% censustracts]
countyfips <- substr(unmatched, 3, 5)
table(countyfips)
#these are in Nassau County, Saratoga, Schenectady, Schoharie, and Westchester.
#So either people are calling about stuff on long island/elsewhere, or the calls are miscoded.
#the only real difference here is between the number of merged tracts (2164) and the original number (2169).  So 4 or 5 tracts don't have calls.

#Merge tract-level population downloaded from AmericanFactfinder on 29 Oct 2014
pop1 <- read.csv("./censuspop_tracts/DEC_10_SF1_SF1DP1_with_ann.csv", stringsAsFactors = FALSE)
#call out population data: skip the first data row and keep the first four columns
tractpop <- data.table(pop1[-1, 1:4])
setnames(tractpop, c("GEO.id2", "HD01_S001"), c("TRACTID", "population_census10"))
census311pop <- merge(censusand311, tractpop, by = "TRACTID")
dim(census311pop)
dim(tractpop)
dim(censusand311) #all tracts carried over

write.csv(census311pop, file = "merged_NYCcensustractdata.csv")

##Pull out identifiers for tracts with no population in census data
#nopop <- subset(census311pop, census311pop$population_census10 <1)
#dim(nopop)
#mean(nopop$census_3monthsbefore)
#problemtracts <- nopop$TRACTID
#write.table(problemtracts, file="problemtracts_nopop.txt")

#and also merge tract-level political donation counts
#(the .Rdata file provides the table tractdonations; it is reused for the public and street-level merges)
load("NYCdonations2010_tractlevel.Rdata")
tractdonations[, TRACTID := as.character(GEOID10)]
#left join: every tract in census311pop is kept, with or without donation data
census311popdon <- merge(census311pop, tractdonations, by = "TRACTID", all.x = TRUE)
dim(census311popdon)
dim(census311pop)
dim(tractdonations)
write.csv(census311popdon, file = "merged_NYCcensustractdata_poldonations.csv")

###Repeat merge for public calls
#GEOID10 is 2010 census tract fips code.
#All merges in this part are left joins (all.x = TRUE): every row of the
#left-hand table is kept even when the right-hand table has no match.
tract311calls_public[, TRACTID := as.character(GEOID10)]
#census data + public call counts
censusand311_public <- merge(NYC, tract311calls_public, by = "TRACTID", all.x = TRUE)
#+ population
census311pop_public <- merge(censusand311_public, tractpop, by = "TRACTID", all.x = TRUE)
write.csv(census311pop_public, file = "merged_NYCcensustractdata_publiconly.csv")
#+ donations
census311popdon_public <- merge(census311pop_public, tractdonations, by = "TRACTID", all.x = TRUE)
write.csv(census311popdon_public, file = "merged_NYCcensustractdata_poldonations_publiconly.csv")

###Repeat merge for street-level calls
tract311calls_street[, TRACTID := as.character(GEOID10)]
#census data + street-level call counts
censusand311_street <- merge(NYC, tract311calls_street, by = "TRACTID", all.x = TRUE)
#+ population
census311pop_street <- merge(censusand311_street, tractpop, by = "TRACTID", all.x = TRUE)
write.csv(census311pop_street, file = "merged_NYCcensustractdata_streetonly.csv")
#+ donations
census311popdon_street <- merge(census311pop_street, tractdonations, by = "TRACTID", all.x = TRUE)
write.csv(census311popdon_street, file = "merged_NYCcensustractdata_poldonations_streetonly.csv")

###########################################
# 5. Assign precinct identifiers to calls
###########################################

#Clear up some space and variable names
#(tract-level intermediates are no longer needed; the names NYC311full1-3 and join1-3 are reused below)
rm(census311pop, census311pop_public, census311pop_street, censusand311, censusand311_public, censusand311_street, join1, join2, join3, NYCtracts, nycsampleT1, nycsampleT2,nycsampleT3, tract311calls,tract311calls_public, tract311calls_street, NYC311full1, NYC311full2, NYC311full3)
rm(nycsample.nogeo)

#Read in precinct shapefiles
#currwd is the project directory recorded when the tract shapefiles were read.
#Point readOGR at the shapefile folder directly instead of changing the working
#directory and changing it back: if the read fails, the session is not left
#stranded inside the subfolder.
NYCprecincts <- readOGR(dsn = file.path(currwd, "NYprecinctshapefiles"), layer = "ny_final")
summary(NYCprecincts)
proj4string(NYCprecincts)

##Assign precinct ID's to calls
#nycsample1-3 are the spatial point objects built for the tract overlay.
#Each is re-projected to the precinct shapefile's coordinate system and overlaid
#on the precinct polygons; the call attributes (@data) are then placed side by
#side with the attributes of the matching precinct.
precinctcrs <- CRS(proj4string(NYCprecincts))

#Full sample
nycsampleT1 <- spTransform(nycsample1, precinctcrs)
join1 <- over(nycsampleT1, NYCprecincts)
rm(nycsampleT1) #free the re-projected copy as soon as the overlay is done
NYC311full1 <- data.table(cbind(nycsample1@data, join1))

#Public calls
nycsampleT2 <- spTransform(nycsample2, precinctcrs)
join2 <- over(nycsampleT2, NYCprecincts)
rm(nycsampleT2)
NYC311full2 <- data.table(cbind(nycsample2@data, join2))

#Street level calls
nycsampleT3 <- spTransform(nycsample3, precinctcrs)
join3 <- over(nycsampleT3, NYCprecincts)
NYC311full3 <- data.table(cbind(nycsample3@data, join3))


###########################################
# 6. Create precinct-level counts of call types, by time periods
###########################################

#Format dates, create time dummies and make the precinct name a character column.
#The helper adds the columns to the table it is given by reference (data.table :=),
#so calling it is enough; nothing needs to be re-assigned.
#Every dummy is 1 for calls strictly between the two dates and NA otherwise.
# NOTE(review): both bounds are exclusive, so e.g. allyear_2010 does not include
# calls dated 2010-01-01 - confirm this is intended.
add_precinct_time_dummies <- function(calls) {
  #Format dates
  calls[, calldate := as.IDate(Created.Date, "%m/%d/%Y %I:%M:%S %p")]
  calls[, callday := as.character(format(calldate, '%m/%d/%Y'))]

  #Around Census
  calls[calldate > "2010-03-14" & calldate < "2010-04-15", census_1montharound := 1]
  #Before Census
  calls[calldate > "2010-01-01" & calldate < "2010-04-01", census_3monthsbefore := 1]
  ## (Roughly) election cycle 2010
  calls[calldate > "2009-01-01" & calldate < "2011-01-01", electioncycle_2010 := 1]
  ##full year 2010
  calls[calldate > "2010-01-01" & calldate < "2011-01-01", allyear_2010 := 1]
  #around election
  calls[calldate > "2010-10-18" & calldate < "2010-11-16", election2010_1montharound := 1]
  #6 months before election
  calls[calldate > "2010-05-01" & calldate < "2010-11-02", election2010_6monthsbefore := 1]
  #3 months before election
  calls[calldate > "2010-08-01" & calldate < "2010-11-02", election2010_3monthsbefore := 1]

  #precinct name: convert from factor
  calls[, NAME10 := as.character(NAME10)]

  invisible(calls)
}

add_precinct_time_dummies(NYC311full1) #full sample
add_precinct_time_dummies(NYC311full2) #public calls
add_precinct_time_dummies(NYC311full3) #street-level calls

#now collapse to precinct-level counts of calls in each category.
#Each dummy is 1/NA, so summing with na.rm=TRUE counts the calls in that window.
#Rows are grouped by precinct name together with the precinct attributes
#POP100, VAP, SUM_VAP and VAP_SHARE, which are carried into the output.
precinctwindowcols <- c(
  "census_1montharound", "census_3monthsbefore",
  "election2010_1montharound", "election2010_3monthsbefore", "election2010_6monthsbefore"
)

#Full sample: additionally counts all calls in 2010 (third column)
precinct311calls <- NYC311full1[
  , lapply(.SD, sum, na.rm = TRUE),
  by = list(NAME10, POP100, VAP, SUM_VAP, VAP_SHARE),
  .SDcols = append(precinctwindowcols, "allyear_2010", after = 2)
]
#at this point, can save interim dataset
save(precinct311calls, file = "precinct311calls.Rdata")

#Public calls
# NOTE(review): unlike the full-sample table, allyear_2010 is not aggregated
# for the public and street-level tables - confirm this is intended.
precinct311calls_public <- NYC311full2[
  , lapply(.SD, sum, na.rm = TRUE),
  by = list(NAME10, POP100, VAP, SUM_VAP, VAP_SHARE),
  .SDcols = precinctwindowcols
]
#at this point, can save interim dataset
save(precinct311calls_public, file = "precinct311calls_publiconly.Rdata")

#Street-level calls
precinct311calls_street <- NYC311full3[
  , lapply(.SD, sum, na.rm = TRUE),
  by = list(NAME10, POP100, VAP, SUM_VAP, VAP_SHARE),
  .SDcols = precinctwindowcols
]
#at this point, can save interim dataset
save(precinct311calls_street, file = "precinct311calls_streetonly.Rdata")

###########################################
# 7. Merge precinct-level call data with turnout rates
###########################################

#Load precinct level vote data for NY state, "New York Data Files" by Ansolabehere and Rodden 2011, accessed at https://dataverse.harvard.edu/dataset.xhtml?persistentId=hdl:1902.1/16320 on 10 Dec 2014
#(the file is tab-separated despite its .csv extension)
vote1 <- read.table("NY_2010_precinctvote.csv", sep = "\t", stringsAsFactors = FALSE, header = TRUE)
#keep only counties in NYC
nyccounties <- c(5, 47, 61, 81, 85)
vote1 <- vote1[vote1$county %in% nyccounties, ]

#Merge all calls with turnout
#Build the precinct key NAME10: the ad value followed by the ed value zero-padded to 5 digits
vote1$edc5 <- formatC(vote1$ed, width = 5, format = "d", flag = "0")
vote1$NAME10 <- paste0(as.character(vote1$ad), vote1$edc5)

#how many distinct precinct keys on each side?
length(unique(vote1$NAME10))
length(unique(precinct311calls$NAME10))
#marker columns so that unmatched rows can be traced back to their source file
precinct311calls[, callsfile := 1]
vote1 <- as.data.table(vote1[, c(1:8, 45:70)]) #drop some unneeded cols
vote1[, votefile := 1]
#full outer join: rows from either file are kept even without a partner
callsvote <- merge(vote1, precinct311calls, by = "NAME10", all = TRUE)
dim(callsvote)
probs <- callsvote[is.na(callsfile) | is.na(votefile), ] #anything that didn't merge (from either direction)
table(probs$callsfile) #about half--that's a little surprising to me.

#Reconcile precinct names between the vote file and the shapefile-based call file.
#Some NAME10 values in the call file are combined strings: several precinct keys
#separated by spaces, and/or ranges written with a hyphen (like 6800061-6800063).
#For each such string, find the vote1 rows whose original key is one of its
#components and relabel them with the combined string, so that they can later be
#summed and merged with the call counts.
vote1[, oldNAME10 := NAME10]
vote1[, oldvtdid := vtdid]

#Call-file rows that found no partner in the vote file
callprobs <- probs[callsfile == 1]
callprobs$vtdid #none of them have a vtdid (they have no vote-file match)
callstrings <- callprobs$NAME10
length(callstrings)

#It's over 20 cases, coded as a plain loop (not efficient, but easy to interpret).
#seq_along() keeps the loop from running when there is nothing to fix.
for (i in seq_along(callstrings)) {
  string <- callstrings[i] #the combined name as it appears in the call file
  vtdidnew <- callprobs$vtdid[i] #vtdid recorded on that call-file row
  strings <- unlist(strsplit(string, split = " ")) #split by spaces, first.

  #expand every hyphenated component into the full run of keys it stands for
  ranges <- strings[grepl("-", strings, fixed = TRUE)]
  for (component in ranges) {
    bounds <- as.numeric(strsplit(component, split = "-", fixed = TRUE)[[1]])
    extras <- as.character(bounds[1]:bounds[2]) #expand, then convert back to strings
    #add everything onto strings (doesn't matter if things are kind of duplicated)
    strings <- c(strings, extras)
  }

  #relabel all vote1 rows whose original key is among the components
  vote1[oldNAME10 %in% strings, NAME10 := string] #drop in the shapefile version of the name
  # NOTE(review): vtdidnew comes from rows without a vote-file match (see above),
  # so this overwrites vtdid on the relabelled rows with that value; the original
  # is kept in oldvtdid - confirm this is intended.
  vote1[oldNAME10 %in% strings, vtdid := vtdidnew]
}

#Collapse the vote1 vote totals by the new NAME10, so that precincts relabelled
#to a common combined name are added up into a single row
dim(vote1)
length(unique(vote1$NAME10))
vote2 <- vote1[
  , lapply(.SD, sum),
  by = list(state, year, vtdid, NAME10),
  .SDcols = c("g2010_uss_tv", "g2010_ush_tv", "g2010_gov_tv")
]
dim(vote2) #it's very weird to me that there aren't duplicates here.  I guess I didn't need this step? hm.
vote2[, votefile := 1]

#Repeat the full outer join to see what still fails to match
callsvote2 <- merge(vote2, precinct311calls, by = "NAME10", all = TRUE)
dim(callsvote2) #WAY WAY better.
probs2 <- callsvote2[is.na(callsfile) | is.na(votefile), ] #anything that didn't merge (from either direction)
probs2v <- callsvote2[is.na(callsfile), ] #in the vote file only
probs2c <- callsvote2[is.na(votefile), ] #in the call file only
sum(probs2c$allyear_2010) #these places only had a total of 264 calls in 2010? NOT WORRIED. drop them.

#Final precinct table: inner join, keeps only precincts found in both files
callsvote <- merge(vote2, precinct311calls, by = "NAME10") #5721 precincts.

### downloaded some LATFOR data on registration. (from here: http://www.latfor.state.ny.us/data/?sec=2010vote)
latfor <- read.csv("vtd10vote.csv", stringsAsFactors = FALSE)
#merge in on the vtdid carried over from the vote data (renamed VTDID for the merge with latfor)
setnames(callsvote, "vtdid", "VTDID")
miss <- callsvote[is.na(VTDID)] #precincts without a VTDID, kept for inspection

#left join: every precinct in callsvote is kept
callsvotereg <- merge(callsvote, latfor, by = "VTDID", all.x = TRUE)
dim(latfor)
dim(callsvote)
dim(callsvotereg)


#write the merged precinct-level data in both R and csv form
save(callsvotereg, file = "precinctlevel311votingdata_mergeddec14.RData")
library(foreign)
write.csv(callsvotereg, file = "precinctlevel311votingdata_mergeddec14.csv")

#Same steps for the public calls: inner join of votes and call counts by precinct,
#then a left join with the LATFOR registration data on VTDID.
callsvote_public <- merge(vote2, precinct311calls_public, by = "NAME10")
dim(callsvote_public)

setnames(callsvote_public, "vtdid", "VTDID")
callsvotereg_public <- merge(callsvote_public, latfor, by = "VTDID", all.x = TRUE)
#compare sizes before and after the merge (as for the full and street-level
#samples: the registration file, the table going in, the table coming out)
dim(latfor)
dim(callsvote_public)
dim(callsvotereg_public)

#write the merged precinct-level data in both R and csv form
save(callsvotereg_public, file = "precinctlevel311votingdata_mergeddec14_public.RData")
library(foreign)
write.csv(callsvotereg_public, file = "precinctlevel311votingdata_mergeddec14_publiconly.csv")

#Same steps for the street-level calls: inner join of votes and call counts by
#precinct, then a left join with the LATFOR registration data on VTDID.
callsvote_street <- merge(vote2, precinct311calls_street, by = "NAME10")
dim(callsvote_street)

setnames(callsvote_street, "vtdid", "VTDID")
callsvotereg_street <- merge(callsvote_street, latfor, by = "VTDID", all.x = TRUE)
#compare sizes before and after the merge
dim(latfor)
dim(callsvote_street)
dim(callsvotereg_street)

#write the merged precinct-level data in both R and csv form
save(callsvotereg_street, file = "precinctlevel311votingdata_mergeddec14_street.RData")
library(foreign)
write.csv(callsvotereg_street, file = "precinctlevel311votingdata_mergeddec14_streetonly.csv")





